Unemployment is measured by the unemployment rate, which is the number of unemployed people expressed as a percentage of the total labour force. The unemployment rate rose sharply during Covid-19, so analyzing it makes a good data science project.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import plotly.express as py
# Location of the dataset on disk (absolute, machine-specific Windows path).
csv_path = r"C:\Users\mprat\OneDrive\Desktop\Unemployment in India.csv"
# Load the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(csv_path)
df.head()
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
# Preview the first ten rows of the dataset.
df.head(10)
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
| 5 | Andhra Pradesh | 31-10-2019 | Monthly | 3.52 | 12017412.0 | 43.01 | Rural |
| 6 | Andhra Pradesh | 30-11-2019 | Monthly | 4.12 | 11397681.0 | 41.00 | Rural |
| 7 | Andhra Pradesh | 31-12-2019 | Monthly | 4.38 | 12528395.0 | 45.14 | Rural |
| 8 | Andhra Pradesh | 31-01-2020 | Monthly | 4.84 | 12016676.0 | 43.46 | Rural |
| 9 | Andhra Pradesh | 29-02-2020 | Monthly | 5.91 | 11723617.0 | 42.83 | Rural |
# Inspect the last rows of the dataset; in this file they are entirely NaN.
df.tail()
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 763 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 764 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 765 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 766 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 767 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Inspect the last ten rows to see how far back the all-NaN rows extend.
df.tail(10)
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 758 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 759 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 760 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 761 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 762 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 763 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 764 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 765 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 766 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 767 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Dimensions of the DataFrame as (rows, columns).
df.shape
(768, 7)
# Summarise each column's dtype and non-null count.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 740 non-null object 1 Date 740 non-null object 2 Frequency 740 non-null object 3 Estimated Unemployment Rate (%) 740 non-null float64 4 Estimated Employed 740 non-null float64 5 Estimated Labour Participation Rate (%) 740 non-null float64 6 Area 740 non-null object dtypes: float64(3), object(4) memory usage: 42.1+ KB
# List the column labels. Note that several of them start with a space
# (e.g. ' Date'), so that space must be included when selecting them.
df.columns
Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
' Estimated Employed', ' Estimated Labour Participation Rate (%)',
'Area'],
dtype='object')
# Count the missing values in each column.
df.isna().sum()
Region 28 Date 28 Frequency 28 Estimated Unemployment Rate (%) 28 Estimated Employed 28 Estimated Labour Participation Rate (%) 28 Area 28 dtype: int64
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | |
|---|---|---|---|
| count | 740.000000 | 7.400000e+02 | 740.000000 |
| mean | 11.787946 | 7.204460e+06 | 42.630122 |
| std | 10.721298 | 8.087988e+06 | 8.111094 |
| min | 0.000000 | 4.942000e+04 | 13.330000 |
| 25% | 4.657500 | 1.190404e+06 | 38.062500 |
| 50% | 8.350000 | 4.744178e+06 | 41.160000 |
| 75% | 15.887500 | 1.127549e+07 | 45.505000 |
| max | 76.740000 | 4.577751e+07 | 72.570000 |
# Take the region labels as a Series and display them.
x = df.loc[:, 'Region']
print(x)
0 Andhra Pradesh
1 Andhra Pradesh
2 Andhra Pradesh
3 Andhra Pradesh
4 Andhra Pradesh
...
763 NaN
764 NaN
765 NaN
766 NaN
767 NaN
Name: Region, Length: 768, dtype: object
# Take the unemployment-rate column (its label starts with a space) and display it.
rate_column = ' Estimated Unemployment Rate (%)'
y = df[rate_column]
print(y)
0 3.65
1 3.05
2 3.75
3 3.32
4 5.17
...
763 NaN
764 NaN
765 NaN
766 NaN
767 NaN
Name: Estimated Unemployment Rate (%), Length: 768, dtype: float64
# Restrict to the three numeric measures and show their summary
# statistics rounded to two decimal places.
numeric_columns = [
    ' Estimated Unemployment Rate (%)',
    ' Estimated Employed',
    ' Estimated Labour Participation Rate (%)',
]
df_stats = df[numeric_columns]
df_stats.describe().round(2)
| Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | |
|---|---|---|---|
| count | 740.00 | 740.00 | 740.00 |
| mean | 11.79 | 7204460.03 | 42.63 |
| std | 10.72 | 8087988.43 | 8.11 |
| min | 0.00 | 49420.00 | 13.33 |
| 25% | 4.66 | 1190404.50 | 38.06 |
| 50% | 8.35 | 4744178.50 | 41.16 |
| 75% | 15.89 | 11275489.50 | 45.50 |
| max | 76.74 | 45777509.00 | 72.57 |
# Print the (truncated) text representation of the whole DataFrame.
print(df)
Region Date Frequency Estimated Unemployment Rate (%) \
0 Andhra Pradesh 31-05-2019 Monthly 3.65
1 Andhra Pradesh 30-06-2019 Monthly 3.05
2 Andhra Pradesh 31-07-2019 Monthly 3.75
3 Andhra Pradesh 31-08-2019 Monthly 3.32
4 Andhra Pradesh 30-09-2019 Monthly 5.17
.. ... ... ... ...
763 NaN NaN NaN NaN
764 NaN NaN NaN NaN
765 NaN NaN NaN NaN
766 NaN NaN NaN NaN
767 NaN NaN NaN NaN
Estimated Employed Estimated Labour Participation Rate (%) Area
0 11999139.0 43.24 Rural
1 11755881.0 42.05 Rural
2 12086707.0 43.50 Rural
3 12285693.0 43.97 Rural
4 12256762.0 44.68 Rural
.. ... ... ...
763 NaN NaN NaN
764 NaN NaN NaN
765 NaN NaN NaN
766 NaN NaN NaN
767 NaN NaN NaN
[768 rows x 7 columns]
# Bar chart of the unemployment rate per region.
#
# The frame holds one row per region per month, and a bar chart stacks every
# row that shares an x value. Plotting the raw frame would therefore show the
# *sum* of the monthly rates for each region, which is not a rate. Average the
# observations per region first so each bar is the mean unemployment rate.
# Rows with a missing Region are dropped by groupby.
region_mean_rate = (
    df.groupby('Region', as_index=False)[' Estimated Unemployment Rate (%)'].mean()
)
bg = py.bar(region_mean_rate, x='Region', y=' Estimated Unemployment Rate (%)', color='Region',
            title='Unemployment Rate(state wise) by Bar Graph', template='plotly')
# Order the regions from the highest bar to the lowest.
bg.update_layout(xaxis={'categoryorder': 'total descending'})
bg.show()
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
correlations = df.corr(numeric_only=True)
plt.figure(figsize=(10, 7))
sns.set_style('whitegrid')
sns.heatmap(correlations, annot=True, cmap="viridis")
plt.show()
# Grid of pairwise plots for the DataFrame's columns; returns a seaborn PairGrid.
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x2ca49155bd0>
# Unemployment rate per region.
#
# When `y` is supplied, the histogram aggregates y within each x bin, and the
# default aggregation is a sum. Adding up monthly percentages per region gives
# a meaningless number, so aggregate with the average instead.
fig = py.histogram(df, x='Region', y=' Estimated Unemployment Rate (%)', color='Region',
                   histfunc='avg',
                   title='Unemployment Rate (Region Wise) Histogram')
fig.show()
# Distribution of the unemployment rate, with one colour per region.
plt.figure(figsize=(15, 13))
plt.title("Indian Unemployment")
axes = sns.histplot(x=" Estimated Unemployment Rate (%)", hue="Region", data=df)
# Label the axes on the Axes object that histplot drew on.
axes.set_xlabel("Estimated Unemployment Rate (%)")
axes.set_ylabel("Count")
plt.show()